{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q xgboost==0.90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r spark_processing_job_s3_output_prefix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous Spark Processing Job Name: amazon-reviews-spark-processor-2020-03-30-04-12-49\n"
     ]
    }
   ],
   "source": [
    "print('Previous Spark Processing Job Name: {}'.format(spark_processing_job_s3_output_prefix))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Specify the S3 Location of the Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix_train = '{}/output/tfidf-train'.format(spark_processing_job_s3_output_prefix)\n",
    "prefix_validation = '{}/output/tfidf-validation'.format(spark_processing_job_s3_output_prefix)\n",
    "prefix_test = '{}/output/tfidf-test'.format(spark_processing_job_s3_output_prefix)\n",
    "\n",
    "balanced_tfidf_without_header_train_path = './{}'.format(prefix_train)\n",
    "balanced_tfidf_without_header_validation_path = './{}'.format(prefix_validation)\n",
    "balanced_tfidf_without_header_test_path = './{}'.format(prefix_test)\n",
    "\n",
    "balanced_tfidf_without_header_train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)\n",
    "balanced_tfidf_without_header_validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)\n",
    "balanced_tfidf_without_header_test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/amazon-reviews-spark-processor-2020-03-30-04-12-49/output/tfidf-train', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}\n",
      "{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/amazon-reviews-spark-processor-2020-03-30-04-12-49/output/tfidf-validation', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}\n",
      "{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/amazon-reviews-spark-processor-2020-03-30-04-12-49/output/tfidf-test', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}\n"
     ]
    }
   ],
   "source": [
    "s3_input_train_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_train_s3_uri, content_type='text/csv')\n",
    "s3_input_validation_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_validation_s3_uri, content_type='text/csv')\n",
    "s3_input_test_data = sagemaker.s3_input(s3_data=balanced_tfidf_without_header_test_s3_uri, content_type='text/csv')\n",
    "\n",
    "print(s3_input_train_data.config)\n",
    "print(s3_input_validation_data.config)\n",
    "print(s3_input_test_data.config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "import os\r\n",
      "import argparse\r\n",
      "import pickle as pkl\r\n",
      "import pandas as pd\r\n",
      "from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix\r\n",
      "from sklearn import metrics\r\n",
      "from sklearn.base import BaseEstimator, TransformerMixin\r\n",
      "import nltk\r\n",
      "import re\r\n",
      "import xgboost as xgb\r\n",
      "from xgboost import XGBClassifier\r\n",
      "import glob\r\n",
      "\r\n",
      "\r\n",
      "def load_dataset(path, sep, header):\r\n",
      "    data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)\r\n",
      "\r\n",
      "    labels = data.iloc[:,0]\r\n",
      "    features = data.drop(data.columns[0], axis=1)\r\n",
      "    \r\n",
      "    if header==None:\r\n",
      "        # Adjust the column names after dropped the 0th column above\r\n",
      "        # New column names are 0 (inclusive) to len(features.columns) (exclusive)\r\n",
      "        new_column_names = list(range(0, len(features.columns)))\r\n",
      "        features.columns = new_column_names\r\n",
      "\r\n",
      "    return features, labels\r\n",
      "\r\n",
      "\r\n",
      "def model_fn(model_dir):\r\n",
      "    \"\"\"\r\n",
      "    :param: model_dir The directory where model files are stored.\r\n",
      "    :return: a model\r\n",
      "    \"\"\"\r\n",
      "    # IsADirectoryError: [Errno 21] Is a directory: '/opt/ml/model'\r\n",
      "    import os\r\n",
      "    list_dirs = os.listdir(model_dir)\r\n",
      "    for file in dirs:\r\n",
      "        print(file)\r\n",
      "\r\n",
      "    model = pkl.load(open(model_dir, 'rb'))\r\n",
      "\r\n",
      "    print(type(model))\r\n",
      "    \r\n",
      "    return model\r\n",
      "\r\n",
      "\r\n",
      "def input_fn(request_body, request_content_type):\r\n",
      "    \"\"\"\r\n",
      "    Deserialize the Invoke request body into an object we can perform prediction on\r\n",
      "    \"\"\"\r\n",
      "    \"\"\"An input_fn that loads a pickled object\"\"\"\r\n",
      "    if request_content_type == \"application/json\":\r\n",
      "        pass\r\n",
      "    else:\r\n",
      "        # Handle other content-types here or raise an Exception\r\n",
      "        # if the content type is not supported.\r\n",
      "        pass\r\n",
      "\r\n",
      "    print(request_body)    \r\n",
      "    return [1]\r\n",
      "\r\n",
      "\r\n",
      "def predict_fn(input_object, model):\r\n",
      "    \"\"\"\r\n",
      "    Perform prediction on the deserialized object, with the loaded model\r\n",
      "    \"\"\"\r\n",
      "    return [1]\r\n",
      "\r\n",
      "\r\n",
      "def output_fn(output, output_content_type):\r\n",
      "    \"\"\"\r\n",
      "    Serialize the prediction result into the desired response content type\r\n",
      "    \"\"\"\r\n",
      "    #return json.dumps({'output':output.reshape(-1).tolist()}), output_content_type\r\n",
      "    print(output)\r\n",
      "    return [1]\r\n",
      "\r\n",
      "\r\n",
      "if __name__ == '__main__':\r\n",
      "    parser = argparse.ArgumentParser()\r\n",
      "    parser.add_argument('--objective', type=str, default='binary:logistic')\r\n",
      "    parser.add_argument('--max-depth', type=int, default=5)\r\n",
      "    parser.add_argument('--num-round', type=int, default=1)   \r\n",
      "    parser.add_argument('--train-data', type=str, default=os.environ['SM_CHANNEL_TRAIN'])\r\n",
      "    parser.add_argument('--validation-data', type=str, default=os.environ['SM_CHANNEL_VALIDATION'])\r\n",
      "    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])\r\n",
      "\r\n",
      "    args, _ = parser.parse_known_args()   \r\n",
      "    objective  = args.objective    \r\n",
      "    max_depth  = args.max_depth\r\n",
      "    num_round  = args.num_round\r\n",
      "    train_data   = args.train_data\r\n",
      "    validation_data = args.validation_data    \r\n",
      "    model_dir  = args.model_dir\r\n",
      "    \r\n",
      "    # Load transformed features (is_positive_sentiment, f0, f1, ...)    \r\n",
      "    X_train, y_train = load_dataset(train_data, ',', header=None)\r\n",
      "    X_validation, y_validation = load_dataset(validation_data, ',', header=None)\r\n",
      "\r\n",
      "    xgb_estimator = XGBClassifier(objective=objective,\r\n",
      "                                  num_round=num_round,\r\n",
      "                                  max_depth=max_depth)\r\n",
      "\r\n",
      "    xgb_estimator.fit(X_train, y_train)\r\n",
      "\r\n",
      "    # TODO:  use the model_dir that is passed in through args\r\n",
      "    #        (currently SM_MODEL_DIR)\r\n",
      "    os.makedirs(model_dir, exist_ok=True)\r\n",
      "    model_path = os.path.join(model_dir, 'xgboost-model')\r\n",
      "\r\n",
      "    pkl.dump(xgb_estimator, open(model_path, 'wb'))\r\n",
      "    print('Wrote model to {}'.format(model_path))\r\n",
      "    \r\n",
      "    xgb_estimator_restored = pkl.load(open(model_path, 'rb'))\r\n",
      "    type(xgb_estimator_restored) \r\n",
      "    \r\n",
      "    preds_validation = xgb_estimator_restored.predict(X_validation)\r\n",
      "    print('Validation Accuracy: ', accuracy_score(y_validation, preds_validation))\r\n",
      "    print('Validation Precision: ', precision_score(y_validation, preds_validation, average=None))\r\n",
      "    \r\n",
      "    print(classification_report(y_validation, preds_validation))\r\n",
      "\r\n",
      "    # TODO:  Convert to preds_validation_0_or_1\r\n",
      "\r\n",
      "    ##############\r\n",
      "#   Note:  roc_auc is causing the following:\r\n",
      "#   ValueError: multiclass format is not supported\r\n",
      "#     Traceback (most recent call last):\r\n",
      "#   File \"/miniconda3/lib/python3.6/runpy.py\", line 193, in _run_module_as_main\r\n",
      "#     \"__main__\", mod_spec)\r\n",
      "#   File \"/miniconda3/lib/python3.6/runpy.py\", line 85, in _run_code\r\n",
      "#     exec(code, run_globals)\r\n",
      "#   File \"/opt/ml/code/xgboost_reviews.py\", line 75, in <module>\r\n",
      "#     auc = round(metrics.roc_auc_score(y_validation, preds_validation), 4)\r\n",
      "#   File \"/miniconda3/lib/python3.6/site-packages/sklearn/metrics/ranking.py\", line 356, in roc_auc_score\r\n",
      "#     sample_weight=sample_weight)\r\n",
      "#   File \"/miniconda3/lib/python3.6/site-packages/sklearn/metrics/base.py\", line 74, in _average_binary_score\r\n",
      "#     raise ValueError(\"{0} format is not supported\".format(y_type))\r\n",
      " \r\n",
      "#    auc = round(metrics.roc_auc_score(y_validation, preds_validation), 4)\r\n",
      "#    print('AUC is ' + repr(auc))\r\n"
     ]
    }
   ],
   "source": [
    "!cat src/xgboost_reviews.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.xgboost import XGBoost\n",
    "\n",
    "model_output_path = 's3://{}/models/amazon-reviews/script-mode/training-runs'.format(bucket)\n",
    "\n",
    "xgb_estimator = XGBoost(entry_point='xgboost_reviews.py', \n",
    "                        source_dir='src/',\n",
    "                        role=role,\n",
    "                        train_instance_count=1,\n",
    "                        train_instance_type='ml.c5.9xlarge',\n",
    "                        framework_version='0.90-2',\n",
    "                        py_version='py3',\n",
    "                        output_path=model_output_path,\n",
    "                        hyperparameters={'objective':'binary:logistic',\n",
    "                                         'num_round': 1,\n",
    "                                         'max_depth': 5},\n",
    "                        enable_cloudwatch_metrics=True\n",
    "                       )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_estimator.fit(inputs={'train': s3_input_train_data, \n",
    "                          'validation': s3_input_validation_data}, wait=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training_job_name:  sagemaker-xgboost-2020-03-30-22-14-40-855\n"
     ]
    }
   ],
   "source": [
    "training_job_name = xgb_estimator.latest_training_job.name\n",
    "print('training_job_name:  {}'.format(training_job_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<b>Review <a href=\"https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logStream:group=/aws/sagemaker/TrainingJobs;prefix=sagemaker-xgboost-2020-03-30-22-14-40-855;streamFilter=typeLogStreamPrefix\">CloudWatch Logs</a> After About 5 Minutes</b>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "\n",
    "display(HTML('<b>Review <a href=\"https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix\">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from sagemaker.xgboost import XGBoost\n",
    "\n",
    "xgb_estimator = XGBoost.attach(training_job_name=training_job_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predict with the Model\n",
    "Invoke the endpoint that was deployed during the Pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.describe_endpoint(EndpointName=training_job_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm_runtime = boto3.Session().client(service_name='sagemaker-runtime', region_name=region)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "payload = 'This item is great!'\n",
    "\n",
    "response = sm_runtime.invoke_endpoint(\n",
    "    EndpointName=training_job_name,\n",
    "    Body=payload.encode('utf-8'),\n",
    "    ContentType='text/csv')['Body'].read()\n",
    "\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def serializer(df):\n",
    "    return [1]\n",
    "#     feature_dim = customer_index.shape[0] + product_index.shape[0] + 1\n",
    "#     js = {'instances': []}\n",
    "#     for index, data in df.iterrows():\n",
    "#         js['instances'].append({'data': {'features': {'values': [1, 1, data['days_since_first']],\n",
    "#                                                       'keys': [data['user'], data['item'], feature_dim - 1],\n",
    "#                                                       'shape': [feature_dim]}}})\n",
    "#     return json.dumps(js)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def deserializer(df):\n",
    "    return [1]\n",
    "#     feature_dim = customer_index.shape[0] + product_index.shape[0] + 1\n",
    "#     js = {'instances': []}\n",
    "#     for index, data in df.iterrows():\n",
    "#         js['instances'].append({'data': {'features': {'values': [1, 1, data['days_since_first']],\n",
    "#                                                       'keys': [data['user'], data['item'], feature_dim - 1],\n",
    "#                                                       'shape': [feature_dim]}}})\n",
    "#     return json.dumps(js)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.predictor import json_deserializer\n",
    "\n",
    "xgb_predictor.content_type = 'text/csv'\n",
    "xgb_predictor.serializer = serializer\n",
    "xgb_predictor.deserializer = deserializer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb_predictor.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
